********************************************

do "E:/ReplicateBuild/02_code/00_environment/00_set_environment.do"

*Table of Contents
local hrmsAppData = 1
local FOCALElemSampleData = 1
local tchData = 1
local tchYearData = 1
local ncerdc_principal_panel = 1
local schoolYearData = 1
local applicantData = 1
local applicantYearData = 1
local vacanciesData = 1
local applicationsData = 1
	local dataCheck = 1
	
********************************************************************************
* Read in statistics from applying in HRMS
********************************************************************************

if `hrmsAppData' == 1 {
	********************************************************************************
	* (tempfile) Teacher-year statistics on applying in hrms
	* Identifier: ncerdc_id
	********************************************************************************
	
	use "$basedata/hrms_district_apps", clear
	gen FOCAL_lea_cd = (lea_cd==XXX) // hiding district
	gen non_FOCAL_lea_cd = (lea_cd!=XXX) // hiding district
	collapse (max) FOCAL_lea_cd non_FOCAL_lea_cd, by(hrms_app_id)
	
	tempfile temphrmslea
	save `temphrmslea', replace
	
	use "$basedata/hrms_school_apps", clear
	
	sort hrms_app_id
	
	merge n:1 hrms_app_id using `temphrmslea'
	drop if _m==2
	drop _m
	
	merge n:1 hrms_app_id using "$basedata/hrms_applicants", keepusing(app_avail_dte app_mod_dte app_create_dte app_timestamp ncerdc_id)
	drop if _m==2
	drop _m
	
	sort hrms_job_id vac_lea_id
	
	merge n:1 hrms_job_id vac_lea_id using "$basedata/hrms_vacancies", keepusing(position_available_date vacancy_create_date vacancy_status_change_date vacancy_closing_date)
	drop if _m==2
	drop _m
	
	gen hrms_app_year = year(position_available_date)
	keep if hrms_app_year>=2011 & hrms_app_year<=2016
	
	forv yy=2011/2016 {
		gen hrms_app_`yy' = (hrms_app_year==`yy')
	}
	
	drop if ncerdc_id==.
	
	collapse (max) hrms_app_20* FOCAL_lea_cd non_FOCAL_lea_cd, by(ncerdc_id)
	
	
	tempfile temphrms
	save `temphrms', replace
}

********************************************************************************
* Read in schools data to generate elementary FOCAL sample
********************************************************************************

if `FOCALElemSampleData' == 1 {
	********************************************************************************
	* (tempfile) List of FOCAL elementary schools
	* Identifier: ncerdc_lea ncerdc_schlcode
	********************************************************************************
	
	use "$basedata/ncerdc_ccd", clear
	
	keep if ncerdc_lea=="XXX"  /* restrict to FOCAL */ // hiding district
	drop if grade_low=="06"
	drop if grade_low=="07"
	drop if grade_low=="08"
	drop if grade_low=="09"
	drop if grade_low=="10"
	drop if grade_low=="11"
	drop if grade_low=="M"
	drop if grade_low=="N"
	drop if grade_low=="UG"
	drop if grade_high=="13"
	drop if grade_high=="12"
	drop if grade_high=="08"
	drop if grade_high=="07"
	
	keep ncerdc_lea ncerdc_schlcode
	destring ncerdc_lea ncerdc_schlcode, replace
	
	sort ncerdc_lea ncerdc_schlcode
	
	duplicates drop
	
	tempfile tempsample
	save `tempsample', replace
	
}	
	
	
********************************************************************************
* Combine teacher data -- FOCAL bio + PRAXIS + NCERDC demographics
********************************************************************************

if `tchData' == 1 {
	********************************************************************************
	* Create datasets of all teachers -- one for FOCAL only, one for all.
	* Identifier: (1) FOCALEmployeeID
	* Identifier: (2) ncerdc_id
	********************************************************************************
	
	use "$basedata/FOCAL_teacher_bio", clear
	
	sort ncerdc_id
	merge n:1 ncerdc_id using "$basedata/ncerdc_demographics" // we have FOCAL teachers that share an ncerdc id
	drop if _m==2 // a decent percentage of master doesn't match
	drop _m
	
	merge n:1 ncerdc_id using `temphrms'
	drop if _m==2
	drop _m
	
	save "$basedata/FOCAL_teacher_data", replace
	
	use "$basedata/ncerdc_demographics"

	merge n:1 ncerdc_id using `temphrms'
	drop if _m==2
	drop _m
	
	duplicates report ncerdc_id 
	
	save "$basedata/allLEA_teacher_data", replace
}

********************************************************************************
* Combine teacher-year data -- salary/experience + EVAAS + VA
********************************************************************************

if `tchYearData' == 1 {
	********************************************************************************
	* Create datasets of all teacher-years -- one for FOCAL only, one for all.
	* Identifier (1): FOCALEmployeeID sy
	* Identifier (2): ncerdc_id sy
	********************************************************************************
	
	use "$basedata/FOCAL_employees_crosswalk", clear
	
	sort ncerdc_id
	
	joinby ncerdc_id using "$basedata/ncerdc_experience", unmatched(both)
	drop _merge
	
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `tempsample'
	assert _m!=2
	gen elem_school_sample = (_m==3)
	drop _m
	
	foreach suff in "" "_white" "_achHi_ma" {
	    
	if "`suff'"=="" {
	    local suff2 = ""
	}
	if "`suff'"=="_white" {
	    local suff2 = "_race"
	}
	if "`suff'"=="_achHi_ma" {
	    local suff2 = "_ach"
	}
	
	preserve
		use "$basedata/va_estimates_drift`suff'", clear
		
		ren (j t) (ncerdc_id sy)
		
		sort ncerdc_id sy
			
		collapse (sum) n_ct_* (max) multi_school_* (mean) p_m*_* mu_*_m*_hat_* Abar* vijt*, by(ncerdc_id sy)
		drop if ncerdc_id==.
		
		ren (n_ct_*) (n_ct_*`suff2')
		ren (multi_school_*) (multi_school_*`suff2')
		ren (p_m*_*) (p_m*_*`suff2')
		ren (mu_*_m*_hat_*) (mu_*_m*_hat_*`suff2')
		ren (Abar*) (Abar*`suff2')
		ren (vijt*)  (vijt*`suff2')
		
		tempfile tempva_shrunk`suff2'
		save `tempva_shrunk`suff2'', replace
	restore
	}
	
	foreach suff2 in "" "_race" "_ach" {
		sort ncerdc_id sy
		merge n:1 ncerdc_id sy using `tempva_shrunk`suff2''
		drop if _m==2
		gen va_merge`suff2' = _m
		drop _m
	}


	merge n:1 ncerdc_id using `temphrms'
	drop if _m==2
	drop _m
	
	gen hrms_app = .
	forvalues yy=2011/2016 {
		replace hrms_app = hrms_app_`yy' if sy==`yy'
		drop hrms_app_`yy'
	}
	
	sort FOCALEmployeeID sy
	merge n:1 FOCALEmployeeID sy using "$basedata/FOCAL_teacher_year_evaas"
	drop if _m==2
	drop _m
	
	
	preserve
		keep if FOCALEmployeeID!=.
		sort FOCALEmployeeID sy
		duplicates report FOCALEmployeeID sy
		
		merge m:1 ncerdc_lea ncerdc_schlcode sy using "$basedata/ncerdc_principals"
		drop if _merge==2
		drop _merge

		save "$basedata/FOCAL_teacher_year_data", replace
	restore

	
	keep if ncerdc_id!=.
	sort ncerdc_id sy FOCALEmployeeID
	duplicates tag ncerdc_id sy, gen(dupvar)
	assert dupvar==0 | dupvar==1
	gen FOCALEmployeeID2 = FOCALEmployeeID[_n+1] if dupvar==1
	drop if ncerdc_id==ncerdc_id[_n-1] & sy==sy[_n-1]
	duplicates report ncerdc_id sy
	drop dupvar
	
	* create transfer variable
	bys ncerdc_id: egen ever_multi = max(multi_school)
	bys ncerdc_id sy: egen num_obs = count(ever_multi)
	bys ncerdc_id: egen max_numobs = max(num_obs)
	
	sort ncerdc_id sy 
	gen transfer_salary = ncerdc_lea!=ncerdc_lea[_n-1] | ncerdc_schlcode!=ncerdc_schlcode[_n-1] if ncerdc_id==ncerdc_id[_n-1] & sy==sy[_n-1]+1 & ncerdc_schlcode!=. & ncerdc_schlcode[_n-1]!=. & ncerdc_schlcode!=0 & ncerdc_schlcode[_n-1]!=0 & ever_multi==0 & max_numobs==1
	
	* fill in school and lea codes
	gen lea_salary = ncerdc_lea
	gen schlcode_salary = ncerdc_schlcode
	

	
	sort ncerdc_id sy
	gen transfer = ncerdc_lea!=ncerdc_lea[_n-1] | ncerdc_schlcode!=ncerdc_schlcode[_n-1] if ncerdc_id==ncerdc_id[_n-1] & sy>sy[_n-1] & ncerdc_schlcode!=. & ncerdc_schlcode[_n-1]!=. & ncerdc_schlcode!=0 & ncerdc_schlcode[_n-1]!=0 & ever_multi==0 & max_numobs==1
	
	****************************************************************************
	*Clean tchr_exp_pay_level
	****************************************************************************
	*Flag for if we adjusted tchr_exp_pay_level.'
	gen mod_tchr_exp_pay_level = 0
	
	*Count how many people are missing tchr_exp_pay_level for all years. 
	sort ncerdc_id 
	bysort ncerdc_id: egen missing_pay_level = min(missing(tchr_exp_pay_level))
	count if missing_pay_level
	unique ncerdc_id if missing_pay_level
	drop missing_pay_level

	
	*Use present tchr_exp_pay_level for given person to fill in missing tchr_exp_pay_level
	sort ncerdc_id sy 
	gen last_pay_record = tchr_exp_pay_level if !missing(tchr_exp_pay_level)
	gen last_pay_record_sy = sy if !missing(tchr_exp_pay_level)

	replace last_pay_record = last_pay_record[_n-1] if ncerdc_id == ncerdc_id[_n-1] & last_pay_record[_n-1]!=. & last_pay_record==. 
	replace last_pay_record_sy = last_pay_record_sy[_n-1] if ncerdc_id == ncerdc_id[_n-1] & last_pay_record_sy[_n-1]!=. & last_pay_record_sy==. 

	gen next_pay_record = tchr_exp_pay_level if !missing(tchr_exp_pay_level)
	gen next_pay_record_sy = sy if !missing(tchr_exp_pay_level)

	forval i = 1/11{
	   replace next_pay_record = tchr_exp_pay_level[_n+`i'] if ncerdc_id == ncerdc_id[_n+`i'] & next_pay_record == . & tchr_exp_pay_level[_n+`i']!=.
	   replace next_pay_record_sy = sy[_n+`i'] if ncerdc_id == ncerdc_id[_n+`i'] & next_pay_record_sy == . & tchr_exp_pay_level[_n+`i']!=.
	}
	
	*Fill in missing tchr_exp_pay_level when missing is between observable tchr_exp_pay_levels and gap is linear.
	replace mod_tchr_exp_pay_level = 1 if missing(tchr_exp_pay_level) & next_pay_record_sy-last_pay_record_sy == next_pay_record - last_pay_record & !missing(next_pay_record_sy-last_pay_record_sy)
	replace tchr_exp_pay_level = last_pay_record + (sy-last_pay_record_sy) if missing(tchr_exp_pay_level) & next_pay_record_sy-last_pay_record_sy == next_pay_record - last_pay_record & !missing(next_pay_record_sy-last_pay_record_sy)
	
	*Fill in tchr_exp_pay_level when missing is before first observable tchr_exp_pay_level
	replace mod_tchr_exp_pay_level = 1 if missing(tchr_exp_pay_level) & missing(last_pay_record)
	replace tchr_exp_pay_level = next_pay_record - (next_pay_record_sy-sy) if missing(tchr_exp_pay_level) & missing(last_pay_record)
	*Small amount go negative with this modification, so we force them to be 0.
	replace tchr_exp_pay_level = 0 if tchr_exp_pay_level < 0

	*Fill in tchr_exp_pay_level when missing is after last observable tchr_exp_pay_level
	replace mod_tchr_exp_pay_level = 1 if missing(tchr_exp_pay_level) & missing(next_pay_record)
	replace tchr_exp_pay_level = last_pay_record + (sy-last_pay_record_sy) if missing(tchr_exp_pay_level) & missing(next_pay_record)
	
	drop last_pay_record last_pay_record_sy next_pay_record next_pay_record_sy
	*Remaining missings fall into two categories:
	*(A) Missing occurs for person who is missing tchr_exp_pay_level in ever year.
	*(B) Missing occurs for between two observable tchr_exp_pay_level (one before, one after) for a person, but interval implies experience growth higher or lower than 1 per year.

	*Check for non-monotonities occuring for tchr_exp_pay_level
	sort ncerdc_id sy

	gen non_mono = 0

	forval i = 1/10{
	   replace non_mono = 1 if ncerdc_id == ncerdc_id[_n+`i'] & tchr_exp_pay_level > tchr_exp_pay_level[_n+`i'] & tchr_exp_pay_level[_n+`i']!=. & tchr_exp_pay_level!=.
	}

	bysort ncerdc_id: egen non_mono_tchr_exp_pl = max(non_mon)
	drop non_mono
	*There are some non-monotonities, but they seem minimal.
	****************************************************************************
	
	****************************************************************************
	*Clean masters pay
	****************************************************************************
	sort ncerdc_id sy
	replace educ_pay_level = educ_pay_level[_n-1] if missing(educ_pay_level) & ncerdc_id == ncerdc_id[_n-1] & educ_pay_level[_n-1] !=.
	replace educ_pay_level = educ_pay_level[_n-1] if educ_pay_level < educ_pay_level[_n-1] &  ncerdc_id == ncerdc_id[_n-1] & educ_pay_level[_n-1] !=.


	
	drop ever_multi num_obs max_numobs
	
	preserve 
		use "$basedata/ncerdc_education", clear
		
		gen graduation_year = year(graduation_date)
		drop if graduation_year < 1958 | graduation_year > 2019
		
		gen bachelors_graduation = graduation_date if educational_attainment == 4
		gen masters_graduation = graduation_date if educational_attainment == 5
		gen advanced_graduation = graduation_date if educational_attainment == 6
		gen doctorate_graduation = graduation_date if educational_attainment == 7
		
		local degrees "bachelors masters advanced doctorate"
		foreach d in `degrees'{
			gen `d'_graduation_year = year(`d'_graduation)
			gen `d'_graduation_month = month(`d'_graduation)
			
			*Get sy for which degree level would be used for pay
			gen  `d'_graduation_sy = `d'_graduation_year + 1
			replace `d'_graduation_sy = `d'_graduation_year if `d'_graduation_month < 5
		}
		
		collapse (min) *_graduation_sy, by(ncerdc_id)
		
		tempfile temp_degrees
		save `temp_degrees', replace
	restore
		
	merge m:1 ncerdc_id using `temp_degrees'
	drop if _merge==2
	drop _merge
	
	gen educ_lvl = .
	replace educ_lvl = 4 if bachelors_graduation_sy <= sy 
	replace educ_lvl = 5 if masters_graduation_sy <= sy 
	replace educ_lvl = 6 if advanced_graduation_sy <= sy 
	replace educ_lvl = 7 if doctorate_graduation_sy <= sy 
	
	label define educ_labels 4 "Bachelors" 5 "Masters" 6 "Advanced" 7 "Doctorate" 
	label values educ_lvl educ_labels
	
	drop *_graduation_sy
	
	*TFA Info from apps.
	preserve
		use "$basedata/FOCAL_applicant_account_info", clear
		keep applicant_id ncerdc_id
		isid applicant_id
		drop if ncerdc_id == .
		
		tempfile temp_applicant_ncerdc_ids
		save `temp_applicant_ncerdc_ids', replace
	restore
	
	preserve 
		use "$basedata/FOCAL_applications.dta", clear
		keep if tfa_app == 1
		merge m:1 applicant_id using `temp_applicant_ncerdc_ids'
		keep if _merge == 3
		drop _merge
		
		collapse (min) app_year tfa_app, by(ncerdc_id)
		
		gen tfa = tfa_app 
		gen tfa_app_year = app_year
		
		tempfile temp_tfa_info
		save `temp_tfa_info', replace
	restore
		
	merge m:1 ncerdc_lea ncerdc_schlcode sy using "$basedata/ncerdc_principals"
	drop if _merge==2
	drop _merge
	
	tostring(ncerdc_lea), replace
	tostring(ncerdc_schlcode), replace
	
	merge m:1 ncerdc_lea ncerdc_schlcode sy using "$basedata/ncerdc_ccd"
	drop if _merge==2
	drop _merge
	
	merge m:1 ncerdc_id using `temp_tfa_info'
	drop if _merge == 2
	drop _merge
	
	save "$basedata/allLEA_teacher_year_data", replace

}


********************************************************************************
* Construct NCERDC experience for employees who become principals
********************************************************************************
if `ncerdc_principal_panel' == 1{
	use "$basedata/ncerdc_principals", clear
	keep ncerdc_id
	duplicates drop
	
	tempfile temp_ncerdc_principals
	save `temp_ncerdc_principals', replace
	
	use "$basedata/allLEA_teacher_year_data.dta", clear
	merge m:1 ncerdc_id using `temp_ncerdc_principals'
	keep if _merge==3
	drop _merge
	
	save "$basedata/allLEA_principal_panel.dta", replace
}

********************************************************************************
* Combine school-year data -- common core
********************************************************************************

if `schoolYearData' == 1 {
	********************************************************************************
	* Create dataset of all school-years.
	* Identifier: ncerdc_lea ncerdc_schlcode sy
	********************************************************************************
	
	foreach suff in "" "_white" "_achHi_ma" {
		
	if "`suff'"=="" {
	    local suff2 = ""
	}
	if "`suff'"=="_white" {
	    local suff2 = "_race"
	}
	if "`suff'"=="_achHi_ma" {
	    local suff2 = "_ach"
	}
		
	use "$basedata/va_estimates_drift`suff'", clear 
	
	ren (lea schlcode) (ncerdc_lea ncerdc_schlcode)
	collapse (mean) mean_n1_ma=n_ct_m1_ma mean_n2_ma=n_ct_m2_ma (sum) n_ct*, by(ncerdc_lea ncerdc_schlcode)
	
	gen p_school_m1_ma = n_ct_m1_ma / (n_ct_m1_ma + n_ct_m2_ma)
	gen p_school_m2_ma = 1-p_school_m1_ma

	
	ren (mean_*) (mean_*`suff2')
	ren (n_ct*) (n_ct*`suff2')
	ren (p_school_*) (p_school_*`suff2')
	
	tempfile temptypes`suff2'
	save `temptypes`suff2'', replace
	
	}
	
	
	use "$basedata/ncerdc_ccd", clear
	
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `temptypes'
	drop if _m==2
	drop _m
	
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `temptypes_race'
	drop if _m==2
	drop _m
	
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `temptypes_ach'
	drop if _m==2
	drop _m
	
	gen ncerdc_lea_str = ncerdc_lea
	gen ncerdc_schlcode_str = ncerdc_schlcode
	
	destring ncerdc_lea, force replace
	destring ncerdc_schlcode, force replace
	
	drop if ncerdc_lea==.
	
	sort ncerdc_lea ncerdc_schlcode sy
	
	merge 1:1 ncerdc_lea ncerdc_schlcode sy using "$basedata/ncerdc_report_cards"
	tab _merge
	drop _merge
	
	bys ncerdc_lea ncerdc_schlcode: egen everTitleI = max(titleI)
	
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `tempsample'
	assert _m!=2
	gen elem_school_sample = (_m==3)
	drop _m
	
	preserve
		use "$basedata/FOCAL_applications", clear
		keep if tlp == 1
		bysort ncerdc_lea ncerdc_schlcode: egen sy = min(app_year)
		
		destring(ncerdc_lea), replace
		
		*First wave of LIFT schools not properly identified due to way apps were initially handled // hiding names and values for confidentiality
		/*
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		replace sy = 2013 if ncerdc_lea==XXX & ncerdc_schlcode==XXX //
		*/
		collapse (sum) tlp, by(ncerdc_lea ncerdc_schlcode sy)
		drop if missing(ncerdc_lea) | missing(ncerdc_schlcode)
		
		
		tempfile temptlp
		save `temptlp', replace
	restore
	
	merge 1:1 ncerdc_lea ncerdc_schlcode sy using `temptlp'
	*_merge==2 checked. Only years 2019, 2020 for which we dont have school year data
	*or non school locations.
	drop if _merge==2
	gen tlp_first_year = _merge == 3
	drop _merge
	
	bysort ncerdc_lea ncerdc_schlcode: egen tlp_ever = max(tlp_first_year)
	
	sort ncerdc_lea ncerdc_schlcode sy
	gen tlp_school = 0
	replace tlp_school = 1 if tlp_first_year == 1 
	replace tlp_school = 1 if tlp_school[_n-1]==1 & ncerdc_lea==ncerdc_lea[_n-1] & ncerdc_schlcode==ncerdc_schlcode[_n-1] & sy==sy[_n-1]+1
	
	keep if ncerdc_lea==XXX // hiding identity
	
	merge m:1 ncerdc_lea ncerdc_schlcode sy using "$basedata/ncerdc_principals"
	drop if _merge==2
	drop _merge
	
	save "$basedata/FOCAL_school_year_data", replace

}


********************************************************************************
* Combine applicant data
********************************************************************************

if `applicantData' == 1 {
	********************************************************************************
	* Create dataset of all applicants.
	* Identifier: applicant_id
	********************************************************************************

	use "$basedata/ncerdc_teacher_tests", clear
	collapse (mean) teacher_test_score_std, by(ncerdc_id)
	ren teacher_test_score_std praxis
	tempfile temptest
	save `temptest', replace

	use "$basedata/ncerdc_education", clear
	gen grad_degree = (educational_attainment>=5)
	collapse (max) grad_degree, by(ncerdc_id)
	tempfile tempeducation
	save `tempeducation', replace


	use "$basedata/ncerdc_teacher_certification", clear
	gen certified = firstyr_cert!=.
	tempfile tempcert
	save `tempcert', replace

	use "$basedata/ncerdc_teacher_license", clear
	ren std_license_fy firstyr_lic
	collapse (min) firstyr_lic, by(ncerdc_id)
	gen licensed = firstyr_lic!=.
	tempfile templic
	save `templic', replace
	
	use "$basedata/FOCAL_applicant_account_info", clear
	
	sort ncerdc_id
	
	sort ncerdc_id
	merge n:1 ncerdc_id using "$basedata/ncerdc_demographics"
	drop if _m==2
	drop _m
	
	merge n:1 ncerdc_id using `temphrms'
	drop if _m==2
	drop _m
	
	merge n:1 ncerdc_id using `temptest'
	drop if _m==2
	drop _m
	gen praxisMissing = praxis==.
	replace praxis = 0 if praxis==.

	merge n:1 ncerdc_id using `tempeducation'
	drop if _m==2
	drop _m
	gen grad_degreeMissing = grad_degree==.
	replace grad_degree = 0 if grad_degree==.

	merge n:1 ncerdc_id using `tempcert'
	drop if _m==2
	drop _m
	gen certifiedMissing = certified==.
	replace certified = 0 if certified==.

	merge n:1 ncerdc_id using `templic'
	drop if _m==2
	drop _m
	gen licensedMissing = licensed==.
	replace licensed = 0 if licensed==.
	
	
	sort applicant_id

	save "$basedata/FOCAL_applicant_data", replace

}

********************************************************************************
* Combine applicant-year data
********************************************************************************

if `applicantYearData' == 1 {
	********************************************************************************
	* Create dataset of all applicant-years.
	* Identifier: applicant_id sy
	********************************************************************************
	
	use "$basedata/FOCAL_applications", clear
	gen num_apps = 1
	gen num_apps_base = base_sample
	gen num_apps_tlp = tlp
	collapse (sum) num_apps num_apps_base num_apps_tlp, by(applicant_id app_year)
	
	qui summ app_year
	local yy0 = r(min)
	local yy1 = r(max)
	forv yy = `yy0' / `yy1' {
	    gen num_apps_`yy' = num_apps if app_year==`yy'
	    gen num_apps_base_`yy' = num_apps_base if app_year==`yy'
		gen num_apps_tlp_`yy' = num_apps_tlp if app_year==`yy'
	}
	
	collapse (sum) num_apps_2* num_apps_base_2* num_apps_tlp_2*, by(applicant_id)
	
	tempfile tempapps
	save `tempapps', replace
	
	
	use "$basedata/FOCAL_applicant_account_info", clear
	
	sort applicant_id
	merge 1:1 applicant_id using `tempapps'
	drop if _m==2
	drop _m

	forv yy = `yy0' / `yy1' {
		replace num_apps_`yy' = 0 if num_apps_`yy'==.
		replace num_apps_base_`yy' = 0 if num_apps_base_`yy'==.
		replace num_apps_tlp_`yy' = 0 if num_apps_tlp_`yy'==.
	}
	
	joinby ncerdc_id using "$basedata/ncerdc_experience", unmatched(master)
	drop _merge
	
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `tempsample'
	assert _m!=2
	gen elem_school_sample = (_m==3)
	drop _m
	
	duplicates report applicant_id sy
	
	preserve
	keep ncerdc_id applicant_id
	duplicates drop
	sort ncerdc_id
	tempfile tempids
	save `tempids', replace
	restore
	
	foreach ss in ma { 
		
	preserve
	use "$basedata/va_estimates_drift", clear
	gen e_return_`ss' = mu_jt_m1_hat_preY_`ss' - mu_t_m1_hat_preY_`ss'
	collapse (mean) e_return_`ss', by(e_`ss')
	keep if e_`ss'!=.
	tempfile tempe_`ss'
	save `tempe_`ss'', replace
	restore
	
	preserve
	use "$basedata/va_estimates_drift_white", clear
	gen e_return_`ss' = mu_jt_m1_hat_preY_`ss' - mu_t_m1_hat_preY_`ss'
	collapse (mean) e_return_`ss', by(e_`ss')
	keep if e_`ss'!=.
	ren (e_*) (e_*_race)
	tempfile tempe_`ss'_race
	save `tempe_`ss'_race', replace
	restore
	
	preserve
	use "$basedata/va_estimates_drift_achHi_ma", clear
	gen e_return_`ss' = mu_jt_m1_hat_preY_`ss' - mu_t_m1_hat_preY_`ss'
	collapse (mean) e_return_`ss', by(e_`ss')
	keep if e_`ss'!=.
	ren (e_*) (e_*_ach)
	tempfile tempe_`ss'_ach
	save `tempe_`ss'_ach', replace
	restore
	
	}
	
	foreach suff in "" "_white" "_achHi_ma" {
	    
	if "`suff'"=="" {
	    local suff2 = ""
	}
	if "`suff'"=="_white" {
	    local suff2 = "_race"
	}
	if "`suff'"=="_achHi_ma" {
	    local suff2 = "_ach"
	}
	
	preserve
	
	use "$basedata/va_estimates_drift`suff'", clear
	
	ren (j t) (ncerdc_id sy)
	
	sort ncerdc_id sy
		
	collapse (sum) n_ct_* (max) multi_school_* (mean) p_m*_* mu_*_m*_hat_* Abar* vijt* e_*, by(ncerdc_id sy)
	drop if ncerdc_id==.
	assert e_ma==floor(e_ma) | e_ma==.
	
	
	ren (n_ct_*) (n_ct_*`suff2')
	ren (multi_school_*) (multi_school_*`suff2')
	ren (p_m*_*) (p_m*_*`suff2')
	ren (mu_*_m*_hat_*) (mu_*_m*_hat_*`suff2')
	ren (Abar*) (Abar*`suff2')
	ren (vijt*)  (vijt*`suff2')
	ren (e_*) (e_*`suff2')
	
	* fill in experience for non-observed years
	qui summ sy
	local minyy = r(min)
	local maxyy = r(max)
	
	foreach ss in ma {
	
		forv t=`minyy'/`maxyy' {
			

			gen years_to_t = sy-`t' if e_`ss'`suff2'!=.
			bys ncerdc_id: egen miny = min(years_to_t)
			bys ncerdc_id: egen maxy = max(years_to_t)
			
			gen e_min = e_`ss'`suff2' if years_to_t==miny
			gen e_max = e_`ss'`suff2' if years_to_t==maxy
			
			bys ncerdc_id: egen e_of_min = min(e_min)
			bys ncerdc_id: egen e_of_max = min(e_max)
			
			gen years_to_t_pos = years_to_t if years_to_t>0
			gen years_to_t_neg = years_to_t if years_to_t<0
			
			bys ncerdc_id: egen minpos = min(years_to_t_pos)
			bys ncerdc_id: egen maxneg = max(years_to_t_neg)
			gen e_min_pos = e_`ss'`suff2' if years_to_t_pos==minpos
			gen e_max_neg = e_`ss'`suff2' if years_to_t_neg==maxneg
			bys ncerdc_id: egen e_of_min_pos = min(e_min_pos)
			bys ncerdc_id: egen e_of_max_neg = min(e_max_neg)
			
			gen gap_t = abs(maxneg)/(minpos-maxneg)
			gen gap_e = (e_of_min_pos-e_of_max_neg)*gap_t + e_of_max_neg
			
			* years prior to all observed experience
			replace e_`ss'`suff2' = max(0,e_of_min-miny) if miny>0 & e_`ss'`suff2'==. & sy==`t'
			* years after all observed experience
			replace e_`ss'`suff2' = e_of_max+(-maxy) if maxy<0 & e_`ss'`suff2'==. & sy==`t'
			* years in between
			replace e_`ss'`suff2' = max(0,floor(gap_e)) if e_`ss'`suff2'==. & maxneg<0 & minpos>0 & maxneg!=. & minpos!=. & sy==`t'
			
			drop years_to_t* miny maxy e_mi* e_max* e_of_min* e_of_max* minpos maxneg gap_t gap_e
		}
	}
	
	foreach ss in ma {
		sort e_`ss'`suff2'
		merge n:1 e_`ss'`suff2' using `tempe_`ss'`suff2''
		drop if _m==2
		drop _m
		
		if "`suff2'"=="" {
		
		replace mu_jt_m1_hat_career_`ss'`suff2' = mu_t_m1_hat_career_`ss'`suff2' + e_return_`ss'`suff2' if mu_jt_m1_hat_career_`ss'`suff2'==.
		replace mu_jt_m2_hat_career_`ss'`suff2' = mu_t_m2_hat_career_`ss'`suff2' + e_return_`ss'`suff2' if mu_jt_m2_hat_career_`ss'`suff2'==.	
		
		}

		replace mu_jt_m1_hat_preY_`ss'`suff2' = mu_t_m1_hat_preY_`ss'`suff2' + e_return_`ss'`suff2' if mu_jt_m1_hat_preY_`ss'`suff2'==.
		replace mu_jt_m2_hat_preY_`ss'`suff2' = mu_t_m2_hat_preY_`ss'`suff2' + e_return_`ss'`suff2' if mu_jt_m2_hat_preY_`ss'`suff2'==.	
	}
	
	sort ncerdc_id sy
	
	joinby ncerdc_id using `tempids', unmatched(none)
	
	tempfile tempva_shrunk`suff2'
	save `tempva_shrunk`suff2'', replace
	restore
	}

	
	sort ncerdc_id applicant_id sy
	merge 1:1 ncerdc_id applicant_id sy using `tempva_shrunk'
	gen va_merge = _m
	drop _m
	
	sort ncerdc_id applicant_id sy
	merge 1:1 ncerdc_id applicant_id sy using `tempva_shrunk_race'
	gen va_merge_race = _m
	drop _m
	
	sort ncerdc_id applicant_id sy
	merge 1:1 ncerdc_id applicant_id sy using `tempva_shrunk_ach'
	gen va_merge_ach = _m
	drop _m
	
	
	* spread some variables to hypothetical years
	sort ncerdc_id applicant_id sy
	foreach var in FOCALEmployeeID zip5 teacher_app_latitude teacher_app_longitude internal	 {
		replace `var' = `var'[_n-1] if ncerdc_id==ncerdc_id[_n-1] & sy>sy[_n-1] & `var'==.
	}
	
	gsort ncerdc_id applicant_id -sy
	
	foreach var in FOCALEmployeeID zip5 teacher_app_latitude teacher_app_longitude internal	 {
		replace `var' = `var'[_n-1] if ncerdc_id==ncerdc_id[_n-1] & sy<sy[_n-1] & `var'==.
	}
	
	keep if applicant_id!=""
	
	sort ncerdc_id sy
	
	replace tchr_exp_pay_level = e_ma if tchr_exp_pay_level==.
	
	gen tempvar = sy!=.
	bys ncerdc_id applicant_id: egen non_missing_sy = max(tempvar)
	drop if sy==. & non_missing_sy==1
	drop tempvar non_missing_sy
	
	merge n:1 ncerdc_id using `temphrms'
	drop if _m==2
	drop _m
	
	gen hrms_app = .
	forvalues yy=2011/2016 {
		replace hrms_app = hrms_app_`yy' if sy==`yy'
		drop hrms_app_`yy'
	}

	
	
	sort FOCALEmployeeID sy
	merge n:1 FOCALEmployeeID sy using "$basedata/FOCAL_teacher_year_evaas"
	drop if _m==2
	drop _m
	

	sort applicant_id sy

	save "$basedata/FOCAL_applicant_year_data", replace

}


********************************************************************************
* Combine vacancies data
********************************************************************************

if `vacanciesData' == 1 {
	********************************************************************************
	* Create dataset of all vacancies.
	* Identifier: job_id
	********************************************************************************
	
	use "$basedata/FOCAL_applications", clear
	
	destring ncerdc_lea, force replace
	
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `tempsample'
	assert _m!=2
	gen elem_school_sample = (_m==3)
	drop _m

	
	collapse (min) base_sample tlp tlp_pool tlp_job tlp_pos_type earliest_app latest_app app_year elem_school_sample, by(job_id job_name ncerdc_lea ncerdc_schlcode school_code job_school job_type)
	
	rename app_year sy
	
	merge m:1 ncerdc_lea ncerdc_schlcode sy using "$basedata/ncerdc_principals"
	drop if _merge==2
	drop _merge
	
	rename sy app_year
	

	save "$basedata/FOCAL_vacancies", replace

}

********************************************************************************
* Combine applications data
********************************************************************************

if `applicationsData' == 1 {
	********************************************************************************
	* Create dataset of all applications.
	* Identifier: applicant_id job_id
	********************************************************************************
	
	use "$basedata/FOCAL_school_year_data", clear
	keep if ncerdc_lea==XXX // hiding identity
	keep ncerdc_lea ncerdc_schlcode elem_school_sample tlp_school sy
	
	gen tlp_school_year = sy if tlp_school==1
	
	collapse (min) min_tlp_year=tlp_school_year (max) max_tlp_year=tlp_school_year elem_school_sample, by(ncerdc_lea ncerdc_schlcode)
	
	duplicates drop
	sort ncerdc_lea ncerdc_schlcode
	
	tempfile tempccd
	save `tempccd', replace
	
	use "$basedata/FOCAL_applications", clear
	
	destring ncerdc_lea, force replace
	
	
	* check whether the position is a school in the common core
	sort ncerdc_lea ncerdc_schlcode
	merge n:1 ncerdc_lea ncerdc_schlcode using `tempccd'
	drop if _m==2
	gen job_at_school = (_m==3)
	drop _m
	
	gen tlp_school = (app_year>=min_tlp_year-1 & app_year<=max_tlp_year-1 & min_tlp_year!=. & max_tlp_year!=.)
	drop min_tlp_year max_tlp_year
	
	
	rename app_year sy
	
	merge m:1 ncerdc_lea ncerdc_schlcode sy using "$basedata/ncerdc_principals"
	drop if _merge==2
	drop _merge
	
	rename sy app_year
	
	preserve
		use "$basedata/FOCAL_applicant_account_info", clear
		keep applicant_id ncerdc_id
		isid applicant_id
		drop if ncerdc_id == .
		
		tempfile temp_applicant_ncerdc_ids
		save `temp_applicant_ncerdc_ids', replace
	restore
	
	merge m:1 applicant_id using `temp_applicant_ncerdc_ids'
	drop if _merge==2
	drop _merge

	sort applicant_id job_id

	save "$basedata/FOCAL_applications_data", replace

}




